Analyzing Health Science Data and Travel Patterns

This analysis focuses on deaths due to pneumonia, flu, or COVID-19 in U.S. states between 2020 and 2023. We supplement the analysis with travel data from quarterly reports to identify possible correlations between these health outcomes and travel behavior.

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Loading and Cleaning Data

We start by loading the health science dataset and removing rows with missing values to ensure clean data for downstream analysis.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(corrplot)
## corrplot 0.94 loaded
library(dplyr)
library(readr)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
health <- read_csv("HealthScienceData.csv")
## Rows: 50400 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Data As Of, Start Week, End Week, Week Ending Date, Group, Indicato...
## dbl (8): MMWRyear, MMWRweek, COVID-19 Deaths, Total Deaths, Pneumonia Deaths...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
health
## # A tibble: 50,400 × 16
##    `Data As Of` `Start Week` `End Week` MMWRyear MMWRweek `Week Ending Date`
##    <chr>        <chr>        <chr>         <dbl>    <dbl> <chr>             
##  1 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  2 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  3 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  4 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  5 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  6 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  7 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  8 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  9 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
## 10 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
## # ℹ 50,390 more rows
## # ℹ 10 more variables: Group <chr>, Indicator <chr>, Jurisdiction <chr>,
## #   `Age Group` <chr>, `COVID-19 Deaths` <dbl>, `Total Deaths` <dbl>,
## #   `Pneumonia Deaths` <dbl>, `Influenza Deaths` <dbl>,
## #   `Pneumonia or Influenza` <dbl>,
## #   `Pneumonia, Influenza, or COVID-19 Deaths` <dbl>
health_full_clean <- na.omit(health)
health_full_clean
## # A tibble: 20,685 × 16
##    `Data As Of` `Start Week` `End Week` MMWRyear MMWRweek `Week Ending Date`
##    <chr>        <chr>        <chr>         <dbl>    <dbl> <chr>             
##  1 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  2 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  3 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  4 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  5 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  6 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  7 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  8 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
##  9 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
## 10 11/02/2023   12/29/2019   01/04/2020     2020        1 01/04/2020        
## # ℹ 20,675 more rows
## # ℹ 10 more variables: Group <chr>, Indicator <chr>, Jurisdiction <chr>,
## #   `Age Group` <chr>, `COVID-19 Deaths` <dbl>, `Total Deaths` <dbl>,
## #   `Pneumonia Deaths` <dbl>, `Influenza Deaths` <dbl>,
## #   `Pneumonia or Influenza` <dbl>,
## #   `Pneumonia, Influenza, or COVID-19 Deaths` <dbl>
filtered <- health_full_clean[health_full_clean$`Age Group` != "All Ages", ]
filtered$Quarter <- ifelse(filtered$MMWRweek <= 13, 1,
                     ifelse(filtered$MMWRweek <= 25, 2,
                     ifelse(filtered$MMWRweek <= 38, 3, 4)))
colnames(filtered) <- gsub(" ", "", colnames(filtered))
colnames(filtered) <- gsub("-", "", colnames(filtered))
colnames(filtered) <- gsub(",", "_", colnames(filtered))
filtered$AgeGroup <- factor(filtered$AgeGroup)
filtered
## # A tibble: 14,672 × 17
##    DataAsOf   StartWeek EndWeek MMWRyear MMWRweek WeekEndingDate Group Indicator
##    <chr>      <chr>     <chr>      <dbl>    <dbl> <chr>          <chr> <chr>    
##  1 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  2 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  3 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  4 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  5 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  6 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  7 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  8 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  9 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
## 10 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
## # ℹ 14,662 more rows
## # ℹ 9 more variables: Jurisdiction <chr>, AgeGroup <fct>, COVID19Deaths <dbl>,
## #   TotalDeaths <dbl>, PneumoniaDeaths <dbl>, InfluenzaDeaths <dbl>,
## #   PneumoniaorInfluenza <dbl>, Pneumonia_Influenza_orCOVID19Deaths <dbl>,
## #   Quarter <dbl>
filtered2 <- filtered[!filtered$Jurisdiction %in% c("United States", "HHS Region 1", "HHS Region 2", "HHS Region 3","HHS Region 4", "HHS Region 5", "HHS Region 6", "HHS Region 7", "HHS Region 8", "HHS Region 9", "HHS Region 10"), ]
filtered2
## # A tibble: 11,703 × 17
##    DataAsOf   StartWeek EndWeek MMWRyear MMWRweek WeekEndingDate Group Indicator
##    <chr>      <chr>     <chr>      <dbl>    <dbl> <chr>          <chr> <chr>    
##  1 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  2 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  3 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  4 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  5 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  6 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  7 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  8 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
##  9 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
## 10 11/02/2023 12/29/20… 01/04/…     2020        1 01/04/2020     By W… Week-end…
## # ℹ 11,693 more rows
## # ℹ 9 more variables: Jurisdiction <chr>, AgeGroup <fct>, COVID19Deaths <dbl>,
## #   TotalDeaths <dbl>, PneumoniaDeaths <dbl>, InfluenzaDeaths <dbl>,
## #   PneumoniaorInfluenza <dbl>, Pneumonia_Influenza_orCOVID19Deaths <dbl>,
## #   Quarter <dbl>

Exploratory Data Analysis

Before modeling, we explore the dataset to understand the deaths caused by pneumonia, flu, or COVID-19, and how these vary by state and time period. We use quarters as time periods to prepare for analysis with flight passenger data.

par(mfrow = c(2, 3)) 
for (var in names(filtered2)[c(11, 12, 13, 14, 15, 16)]) {
  boxplot(as.formula(paste(var, "~ AgeGroup")), data = filtered2, main = var)
}

par(mfrow = c(2, 3)) 
for (var in names(filtered2)[c(11, 12, 13, 14, 15, 16)]) {
  boxplot(as.formula(paste(var, "~ Jurisdiction")), data = filtered2, main = var)
}

par(mfrow = c(2, 3)) 
for (var in names(filtered2)[c(11, 12, 13, 14, 15, 16)]) {
  boxplot(as.formula(paste(var, "~ MMWRyear")), data = filtered2, main = var)
}

library(dplyr)

health %>%
  ungroup() %>%
  arrange(desc((as.numeric('Total Deaths')))) %>%
  select('Jurisdiction', 'Total Deaths', 'MMWRyear')
## Warning: There was 1 warning in `arrange()`.
## ℹ In argument: `..1 = (as.numeric("Total Deaths"))`.
## Caused by warning:
## ! NAs introduced by coercion
## # A tibble: 50,400 × 3
##    Jurisdiction  `Total Deaths` MMWRyear
##    <chr>                  <dbl>    <dbl>
##  1 United States          60028     2020
##  2 United States            667     2020
##  3 United States          14706     2020
##  4 United States          44655     2020
##  5 Alabama                 1098     2020
##  6 Alabama                   16     2020
##  7 Alabama                  297     2020
##  8 Alabama                  785     2020
##  9 Alaska                    91     2020
## 10 Alaska                    NA     2020
## # ℹ 50,390 more rows

Visualizing Health Outcomes

We create visualizations such as bar charts and line graphs to display trends in health-related deaths across states. These help to identify patterns over the quarters.

total_deaths_by_quarter <- filtered2 %>%
  group_by(Quarter) %>%
  summarize(TotalDeaths = sum(TotalDeaths))

ggplot(total_deaths_by_quarter, aes(x = Quarter, y = TotalDeaths)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  labs(title = "Total Deaths by Quarter", x = "Quarter", y = "Total Deaths") +
  theme_minimal()

filtered_region <- filtered[filtered$Jurisdiction %in% c("HHS Region 1", "HHS Region 2", "HHS Region 3","HHS Region 4", "HHS Region 5", "HHS Region 6", "HHS Region 7", "HHS Region 8", "HHS Region 9", "HHS Region 10"), ]

filtered_region$Jurisdiction <- factor(filtered_region$Jurisdiction, levels = c("HHS Region 1", "HHS Region 2", "HHS Region 3","HHS Region 4", "HHS Region 5", "HHS Region 6", "HHS Region 7", "HHS Region 8", "HHS Region 9", "HHS Region 10"))

total_deaths_by_region_and_quarter <- filtered_region %>%
  group_by(Jurisdiction, Quarter) %>%
  summarize(TotalDeaths = sum(Pneumonia_Influenza_orCOVID19Deaths, na.rm = TRUE))
## `summarise()` has grouped output by 'Jurisdiction'. You can override using the
## `.groups` argument.
ggplot(total_deaths_by_region_and_quarter, aes(x = Quarter, y = TotalDeaths, fill = Jurisdiction)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Total Deaths by Quarter and Region", x = "Quarter", y = "Total Deaths") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")

total_deaths_by_year_quarter <- filtered_region %>%
  group_by(MMWRyear, Quarter, Jurisdiction) %>%
  summarize(TotalDeaths = sum(Pneumonia_Influenza_orCOVID19Deaths, na.rm = TRUE), .groups = 'drop')

years <- unique(total_deaths_by_year_quarter$MMWRyear)
plot_list <- list()

for (year in years) {
  year_data <- total_deaths_by_year_quarter %>%
    filter(MMWRyear == year)
  
  p <- ggplot(year_data, aes(x = Quarter, y = TotalDeaths, fill = Jurisdiction)) +
    geom_bar(stat = "identity", position = "dodge") + 
    labs(title = paste("Total Deaths by Quarter and Region in", year),
         x = "Quarter",
         y = "Total Deaths") +
    theme_minimal() +
    scale_fill_brewer(palette = "Set3")
  
  plot_list[[as.character(year)]] <- p
}
grid.arrange(grobs = plot_list, ncol = 2, nrow = 2)

disease_columns <- c("COVID19Deaths", "PneumoniaDeaths", "InfluenzaDeaths")
par(mfrow = c(3, 4))
for (disease in disease_columns) {
  deaths_by_year_quarter <- filtered_region %>%
    group_by(MMWRyear, Quarter, Jurisdiction) %>%
    summarize(DiseaseDeaths = sum(.data[[disease]], na.rm = TRUE), .groups = 'drop')
  years <- unique(total_deaths_by_year_quarter$MMWRyear)

for (year in years) {
  year_data <- total_deaths_by_year_quarter %>%
    filter(MMWRyear == year)
  
  p <- ggplot(year_data, aes(x = Quarter, y = TotalDeaths, fill = Jurisdiction)) +
    geom_bar(stat = "identity", position = "dodge") + 
    labs(title = paste("Total Deaths by",  disease, "by Quarter and Region in", year),
         x = "Quarter",
         y = "Total Deaths") +
    theme_minimal() +
    scale_fill_brewer(palette = "Set3")
  
  print(p)
}}

disease_columns <- c("COVID19Deaths", "PneumoniaDeaths", "InfluenzaDeaths")

for (year in c(2020, 2021, 2022, 2023)) {

  plot_list <- list()
  
  for (disease in disease_columns) {

    deaths_by_year_quarter <- filtered_region %>%
      group_by(MMWRyear, Quarter, Jurisdiction) %>%
      summarize(DiseaseDeaths = sum(.data[[disease]], na.rm = TRUE), .groups = 'drop')
    year_data <- deaths_by_year_quarter %>%
      filter(MMWRyear == year)

    p <- ggplot(year_data, aes(x = Quarter, y = DiseaseDeaths, fill = Jurisdiction)) +
      geom_bar(stat = "identity", position = "dodge") + 
      labs(title = paste("Total Deaths by", disease, "in", year),
           x = "Quarter",
           y = "Total Deaths") +
      theme_minimal() +
      scale_fill_brewer(palette = "Set3")
    plot_list[[disease]] <- p
  }

  grid.arrange(grobs = plot_list, ncol = 3)
}

Flight and Travel Data

Next, we look at the filtered dataset containing flight data and passengers. We want to look at the destinations of flights and the number of passengers going to these destinations. We use bar graphs and analyze this data by HHS region as well, so that we can attempt to correlate with the health outcomes data.

business_data <- read_csv("FilteredBusinessDataset.csv")
## Rows: 30875 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): tbl, city1, city2, airport_1, airport_2, carrier_lg, carrier_low, ...
## dbl (13): Year, quarter, citymarketid_1, citymarketid_2, airportid_1, airpor...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
flights <- business_data[order(business_data$Year, business_data$quarter),]
head(flights)
## # A tibble: 6 × 23
##   tbl      Year quarter citymarketid_1 citymarketid_2 city1    city2 airportid_1
##   <chr>   <dbl>   <dbl>          <dbl>          <dbl> <chr>    <chr>       <dbl>
## 1 Table1a  2020       1          30135          33195 Allento… Tamp…       10135
## 2 Table1a  2020       1          30135          33195 Allento… Tamp…       10135
## 3 Table1a  2020       1          30140          32575 Albuque… Los …       10140
## 4 Table1a  2020       1          30140          30852 Albuque… Wash…       10140
## 5 Table1a  2020       1          30140          30194 Albuque… Dall…       10140
## 6 Table1a  2020       1          30140          30852 Albuque… Wash…       10140
## # ℹ 15 more variables: airportid_2 <dbl>, airport_1 <chr>, airport_2 <chr>,
## #   nsmiles <dbl>, passengers <dbl>, fare <dbl>, carrier_lg <chr>,
## #   large_ms <dbl>, fare_lg <dbl>, carrier_low <chr>, lf_ms <dbl>,
## #   fare_low <dbl>, Geocoded_City1 <chr>, Geocoded_City2 <chr>, tbl1apk <chr>
flights2020 <- filter(flights, Year == 2020)
filtered_2020 <- flights2020[flights2020$quarter %in% c("1", "2", "3", "4"), ]

filtered_2020$quarter <- factor(filtered_2020$quarter, levels = c("1", "2", "3" ,"4"))

flights2020_by_quarter <- filtered_2020 %>%
  group_by(Year, quarter) %>%
  summarize(passengers = sum(passengers, na.rm = TRUE))
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
ggplot(flights2020_by_quarter, aes(x = quarter, y = passengers/100000, fill = quarter)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Total Flyers by Quarter in the US in 2020", x = "Quarter", y = "Total Passengers in Millions") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")

flights2021 <- filter(flights, Year == 2021)
filtered_2021 <- flights2021[flights2021$quarter %in% c("1", "2", "3", "4"), ]

filtered_2021$quarter <- factor(filtered_2021$quarter, levels = c("1", "2", "3" ,"4"))

flights2021_by_quarter <- filtered_2021 %>%
  group_by(Year, quarter) %>%
  summarize(passengers = sum(passengers, na.rm = TRUE))
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
ggplot(flights2021_by_quarter, aes(x = quarter, y = passengers/100000, fill = quarter)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Total Flyers by Quarter in the US in 2021", x = "Quarter", y = "Total Passengers by Millions") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")

flights2022 <- filter(flights, Year == 2022)
filtered_2022 <- flights2022[flights2022$quarter %in% c("1", "2", "3", "4"), ]

filtered_2022$quarter <- factor(filtered_2022$quarter, levels = c("1", "2", "3" ,"4"))

flights2022_by_quarter <- filtered_2022 %>%
  group_by(Year, quarter) %>%
  summarize(passengers = sum(passengers, na.rm = TRUE))
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
ggplot(flights2022_by_quarter, aes(x = quarter, y = passengers/100000, fill = quarter)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Total Flyers by Quarter in the US in 2022", x = "Quarter", y = "Total Passengers by Millions") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")

flights2023 <- filter(flights, Year == 2023)
filtered_2023 <- flights2023[flights2023$quarter %in% c("1", "2", "3", "4"), ]

filtered_2023$quarter <- factor(filtered_2023$quarter, levels = c("1", "2", "3" ,"4"))

flights2023_by_quarter <- filtered_2023 %>%
  group_by(Year, quarter) %>%
  summarize(passengers = sum(passengers, na.rm = TRUE))
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
ggplot(flights2023_by_quarter, aes(x = quarter, y = passengers/100000, fill = quarter)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Total Flyers by Quarter in the US in 2023", x = "Quarter", y = "Total Passengers by Millions") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")

Create the dest_state dataframe that found the states of the destination cities in the flight data.

Create the region_dest_state dataframe to divid each state into HHS regions taht correspond with the CDC data.

dest <- flights %>%
  separate(city2, into = c("city_name", "dest_state"), sep = ",") %>%
  arrange(dest_state) 
      

dest$dest_state <- as.character(dest$dest_state)  


for (i in 1:length(dest$dest_state)) {
  dest$dest_state[i] <- sub("^\\s+", "", dest$dest_state[i])  
}




dest <- dest %>%
  
  mutate(HHSregion = case_when(
    dest_state %in% c("CT", "ME", "MA", "MA (Metropolitan Area)" ,"NH", "RI", "VT") ~ "HHS Region 1",
    dest_state %in% c("NJ", "NY", "NY (Metropolitan Area)") ~ "HHS Region 2",
    dest_state %in% c("DE", "DC (Metropolitan Area)", "MD", "PA", "VA","VA (Metropolitan Area)", "WV") ~ "HHS Region 3",
    dest_state %in% c("AL", "FL", "FL (Metropolitan Area)", "GA", "KY", "MS", "NC", "SC", "TN") ~ "HHS Region 4",
    dest_state %in% c("IL", "IN", "MI", "MN", "OH","OH (Metropolitan Area)", "WI") ~ "HHS Region 5",
    dest_state %in% c("AR", "LA", "NM", "OK", "TX") ~ "HHS Region 6",
    dest_state %in% c("IA", "KS", "MO", "NE") ~ "HHS Region 7",
    dest_state %in% c("CO", "MT", "ND", "SD", "UT", "WY") ~ "HHS Region 8",
    dest_state %in% c("AZ", "CA", "CA (Metropolitan Area)", "HI", "NV") ~ "HHS Region 9",
    dest_state %in% c("AK", "ID", "OR", "WA") ~ "HHS Region 10",
    TRUE ~ "Unknown Region" 
  ))
filtered_region_2020 <- filter(dest, Year == 2020)

filtered_region_2020$HHSregion <- factor(filtered_region_2020$HHSregion, levels = c("HHS Region 1", "HHS Region 2", "HHS Region 3","HHS Region 4", "HHS Region 5", "HHS Region 6", "HHS Region 7", "HHS Region 8", "HHS Region 9", "HHS Region 10"))

filtered_region_2020$quarter <- factor(filtered_region_2020$quarter, levels = c("1", "2", "3", "4"))

total_passengers_by_region_and_quarter2020 <- filtered_region_2020 %>%
  group_by(HHSregion, quarter) %>%
  summarize(passengers = sum(passengers, na.rm = TRUE))
## `summarise()` has grouped output by 'HHSregion'. You can override using the
## `.groups` argument.
ggplot(total_passengers_by_region_and_quarter2020, aes(x = quarter, y = passengers/100000, fill = HHSregion)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Total Flyers by Quarter and Region in 2020", x = "Quarter", y = "Total Passengers in Millions") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")

filtered_region_2021 <- filter(dest, Year == 2021)

filtered_region_2021$HHSregion <- factor(filtered_region_2021$HHSregion, levels = c("HHS Region 1", "HHS Region 2", "HHS Region 3","HHS Region 4", "HHS Region 5", "HHS Region 6", "HHS Region 7", "HHS Region 8", "HHS Region 9", "HHS Region 10"))

filtered_region_2021$quarter <- factor(filtered_region_2021$quarter, levels = c("1", "2", "3", "4"))

total_passengers_by_region_and_quarter2021 <- filtered_region_2021 %>%
  group_by(HHSregion, quarter) %>%
  summarize(passengers = sum(passengers, na.rm = TRUE))
## `summarise()` has grouped output by 'HHSregion'. You can override using the
## `.groups` argument.
ggplot(total_passengers_by_region_and_quarter2021, aes(x = quarter, y = passengers/100000, fill = HHSregion)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Total Flyers by Quarter and Region in 2021", x = "Quarter", y = "Total Passengers in Millions") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")

filtered_region_2022 <- filter(dest, Year == 2022)

filtered_region_2022$HHSregion <- factor(filtered_region_2022$HHSregion, levels = c("HHS Region 1", "HHS Region 2", "HHS Region 3","HHS Region 4", "HHS Region 5", "HHS Region 6", "HHS Region 7", "HHS Region 8", "HHS Region 9", "HHS Region 10"))

filtered_region_2022$quarter <- factor(filtered_region_2022$quarter, levels = c("1", "2", "3", "4"))

total_passengers_by_region_and_quarter2022 <- filtered_region_2022 %>%
  group_by(HHSregion, quarter) %>%
  summarize(passengers = sum(passengers, na.rm = TRUE))
## `summarise()` has grouped output by 'HHSregion'. You can override using the
## `.groups` argument.
ggplot(total_passengers_by_region_and_quarter2022, aes(x = quarter, y = passengers/100000, fill = HHSregion)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Total Flyers by Quarter and Region in 2022", x = "Quarter", y = "Total Passengers in Millions") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")

filtered_region_2023 <- filter(dest, Year == 2023)

filtered_region_2023$HHSregion <- factor(filtered_region_2023$HHSregion, levels = c("HHS Region 1", "HHS Region 2", "HHS Region 3","HHS Region 4", "HHS Region 5", "HHS Region 6", "HHS Region 7", "HHS Region 8", "HHS Region 9", "HHS Region 10"))

filtered_region_2023$quarter <- factor(filtered_region_2023$quarter, levels = c("1", "2", "3", "4"))

total_passengers_by_region_and_quarter2023 <- filtered_region_2023 %>%
  group_by(HHSregion, quarter) %>%
  summarize(passengers = sum(passengers, na.rm = TRUE))
## `summarise()` has grouped output by 'HHSregion'. You can override using the
## `.groups` argument.
ggplot(total_passengers_by_region_and_quarter2020, aes(x = quarter, y = passengers/100000, fill = HHSregion)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Total Flyers by Quarter and Region in 2020", x = "Quarter", y = "Total Passengers in Millions") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")

Correlation Analysis

Next, we attempt to correlate health-related deaths with travel data using linear regression models. This helps us see any potential relationships between travel and the deaths caused from flu, pneumonia, or COVID-19 deaths.

flights_aggregate <- dest %>%
  group_by(HHSregion, Year, quarter) %>%
  summarise(total_passengers = sum(passengers, na.rm = TRUE), .groups = 'drop')

health_aggregate <- filtered_region %>%
  group_by(Jurisdiction, MMWRyear, Quarter) %>%
  summarise(total_deaths = sum(TotalDeaths, na.rm = TRUE), .groups = 'drop')

merged_data <- merge(flights_aggregate, health_aggregate, 
                     by.x = c("HHSregion", "Year", "quarter"), 
                     by.y = c("Jurisdiction", "MMWRyear", "Quarter"), 
                     all = TRUE)

correlation_result <- cor(merged_data$total_passengers, merged_data$total_deaths, use = "complete.obs")
ggplot(merged_data, aes(x = total_passengers/100000, y = total_deaths/100000)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Correlation Between Passengers and Deaths by Region, Year, and Quarter", 
       x = "Total Passengers (in 100,000s)", 
       y = "Total Deaths (in 100,000s)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

year2020_data <- merged_data[merged_data$Year == 2020, ]
  
  ggplot(year2020_data, aes(x = total_passengers/100000, y = total_deaths/100000)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Correlation Between Passengers and Deaths by Region and Quarter, in 2020", 
       x = "Total Passengers (in 100,000s)", 
       y = "Total Deaths (in 100,000s)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

year2020_data
##         HHSregion Year quarter total_passengers total_deaths
## 1    HHS Region 1 2020       1             3401        23175
## 2    HHS Region 1 2020       2              464        18132
## 3    HHS Region 1 2020       3             1234        24109
## 4    HHS Region 1 2020       4             1219        25544
## 17  HHS Region 10 2020       1            27775        17162
## 18  HHS Region 10 2020       2             5710         9490
## 19  HHS Region 10 2020       3            13504        24391
## 20  HHS Region 10 2020       4            13407        25977
## 33   HHS Region 2 2020       1            80577        46686
## 34   HHS Region 2 2020       2             7463        79400
## 35   HHS Region 2 2020       3            19357        43365
## 36   HHS Region 2 2020       4            25413        18422
## 49   HHS Region 3 2020       1            84153        69311
## 50   HHS Region 3 2020       2            11952        39624
## 51   HHS Region 3 2020       3            29717        62806
## 52   HHS Region 3 2020       4            34534        30522
## 65   HHS Region 4 2020       1           124495       106338
## 66   HHS Region 4 2020       2            20361        52896
## 67   HHS Region 4 2020       3            37961        28385
## 68   HHS Region 4 2020       4            62270        44119
## 81   HHS Region 5 2020       1            26449       110525
## 82   HHS Region 5 2020       2             4215        54629
## 83   HHS Region 5 2020       3            10028        59975
## 84   HHS Region 5 2020       4            10931        59999
## 97   HHS Region 6 2020       1            38125        70098
## 98   HHS Region 6 2020       2             7111         8489
## 99   HHS Region 6 2020       3            15056        28252
## 100  HHS Region 6 2020       4            20400        12143
## 113  HHS Region 7 2020       1            10569        22266
## 114  HHS Region 7 2020       2             1928        17802
## 115  HHS Region 7 2020       3             4573        29104
## 116  HHS Region 7 2020       4             5096        39361
## 129  HHS Region 8 2020       1            17954        11088
## 130  HHS Region 8 2020       2             3100        12206
## 131  HHS Region 8 2020       3             8645        21118
## 132  HHS Region 8 2020       4             8616        26696
## 145  HHS Region 9 2020       1           169019        72935
## 146  HHS Region 9 2020       2            27150        28094
## 147  HHS Region 9 2020       3            62408        84024
## 148  HHS Region 9 2020       4            75171        63182
year2021_data <- merged_data[merged_data$Year == 2021, ]
  
  ggplot(year2021_data, aes(x = total_passengers/100000, y = total_deaths/100000)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Correlation Between Passengers and Deaths by Region and Quarter in year 2021", 
       x = "Total Passengers (in 100,000s)", 
       y = "Total Deaths (in 100,000s)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

year2022_data <- merged_data[merged_data$Year == 2022, ]
  
  ggplot(year2022_data, aes(x = total_passengers/100000, y = total_deaths/100000)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Correlation Between Passengers and Deaths by Region and Quarter in year 2022", 
       x = "Total Passengers (in 100,000s)", 
       y = "Total Deaths (in 100,000s)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

year2023_data <- merged_data[merged_data$Year == 2023, ]
  
  ggplot(year2023_data, aes(x = total_passengers/100000, y = total_deaths/100000)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Correlation Between Passengers and Deaths by Region and Quarter in year 2023", 
       x = "Total Passengers (in 100,000s)", 
       y = "Total Deaths (in 100,000s)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

library(ggcorrplot)

merged_data_numeric <- merged_data %>%
  mutate(
    region_numeric = as.numeric(factor(HHSregion)),
    quarter = as.numeric(quarter)
  ) %>%
  select(quarter, region_numeric, total_passengers, total_deaths)

correlation_matrix <- cor(merged_data_numeric, use = "complete.obs")

print(correlation_matrix)
##                      quarter region_numeric total_passengers total_deaths
## quarter           1.00000000     0.00000000       0.05614626  -0.05382991
## region_numeric    0.00000000     1.00000000       0.21867022   0.04780133
## total_passengers  0.05614626     0.21867022       1.00000000   0.25608464
## total_deaths     -0.05382991     0.04780133       0.25608464   1.00000000
ggcorrplot(correlation_matrix, lab = TRUE, title = "Correlation Matrix")

model_passenger <- lm(total_deaths ~ total_passengers, data = merged_data)
summary(model_passenger)
## 
## Call:
## lm(formula = total_deaths ~ total_passengers, data = merged_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -50228 -16934  -5623  11920 121949 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.594e+04  3.044e+03   8.519  1.2e-14 ***
## total_passengers 1.196e-01  3.591e-02   3.330  0.00108 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27630 on 158 degrees of freedom
## Multiple R-squared:  0.06558,    Adjusted R-squared:  0.05967 
## F-statistic: 11.09 on 1 and 158 DF,  p-value: 0.001081
model_year <- lm(total_deaths ~ Year, data = merged_data)
summary(model_year)
## 
## Call:
## lm(formula = total_deaths ~ Year, data = merged_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -34133 -16970  -9962   9241 138006 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 13004851    3953544   3.289  0.00124 **
## Year           -6417       1956  -3.281  0.00127 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27660 on 158 degrees of freedom
## Multiple R-squared:  0.06379,    Adjusted R-squared:  0.05786 
## F-statistic: 10.77 on 1 and 158 DF,  p-value: 0.001273
model_region <- lm(total_deaths ~ factor(HHSregion), data = merged_data)
summary(model_region)
## 
## Call:
## lm(formula = total_deaths ~ factor(HHSregion), data = merged_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -55146 -12950  -1038   8414 107311 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     18919.4     5988.5   3.159  0.00191 ** 
## factor(HHSregion)HHS Region 10  -1537.7     8469.0  -0.182  0.85616    
## factor(HHSregion)HHS Region 2   12394.6     8469.0   1.464  0.14542    
## factor(HHSregion)HHS Region 3   21227.1     8469.0   2.506  0.01326 *  
## factor(HHSregion)HHS Region 4   41563.4     8469.0   4.908 2.38e-06 ***
## factor(HHSregion)HHS Region 5   41014.5     8469.0   4.843 3.16e-06 ***
## factor(HHSregion)HHS Region 6    9131.3     8469.0   1.078  0.28267    
## factor(HHSregion)HHS Region 7    -418.7     8469.0  -0.049  0.96063    
## factor(HHSregion)HHS Region 8   -5796.9     8469.0  -0.684  0.49473    
## factor(HHSregion)HHS Region 9   23191.0     8469.0   2.738  0.00692 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23950 on 150 degrees of freedom
## Multiple R-squared:  0.3333, Adjusted R-squared:  0.2933 
## F-statistic: 8.333 on 9 and 150 DF,  p-value: 5.078e-10